{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "8a5fab7d",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import findspark\n",
    "findspark.init('/opt/spark')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "52921414",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ef9d4d47",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    }
   ],
   "source": [
    "spark = SparkSession.builder\\\n",
    "    .enableHiveSupport()\\\n",
    "    .getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "55fad11f",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "spark.sql(\"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive\")\n",
    "spark.sql(\"LOAD DATA LOCAL INPATH '/opt/spark/examples/src/main/resources/kv1.txt' INTO TABLE src\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2db995bd",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "[Stage 0:>                                                          (0 + 1) / 1]\r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+-------+\n",
      "|key|  value|\n",
      "+---+-------+\n",
      "|238|val_238|\n",
      "| 86| val_86|\n",
      "|311|val_311|\n",
      "| 27| val_27|\n",
      "|165|val_165|\n",
      "|409|val_409|\n",
      "|255|val_255|\n",
      "|278|val_278|\n",
      "| 98| val_98|\n",
      "|484|val_484|\n",
      "|265|val_265|\n",
      "|193|val_193|\n",
      "|401|val_401|\n",
      "|150|val_150|\n",
      "|273|val_273|\n",
      "|224|val_224|\n",
      "|369|val_369|\n",
      "| 66| val_66|\n",
      "|128|val_128|\n",
      "|213|val_213|\n",
      "+---+-------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "spark.sql(\"SELECT * FROM src\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "01d88f94",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "[Stage 1:=============================>                             (1 + 1) / 2]\r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+\n",
      "|count(1)|\n",
      "+--------+\n",
      "|     500|\n",
      "+--------+\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "spark.sql(\"SELECT COUNT(*) FROM src\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "636e1ccf",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "sqlDF = spark.sql(\"SELECT key, value FROM src WHERE key < 10 ORDER BY key\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "87c682d7",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Key: 0, Value: val_0\n",
      "Key: 0, Value: val_0\n",
      "Key: 0, Value: val_0\n",
      "Key: 2, Value: val_2\n",
      "Key: 4, Value: val_4\n",
      "Key: 5, Value: val_5\n",
      "Key: 5, Value: val_5\n",
      "Key: 5, Value: val_5\n",
      "Key: 8, Value: val_8\n",
      "Key: 9, Value: val_9\n"
     ]
    }
   ],
   "source": [
    "stringsDS = sqlDF.rdd.map(lambda row: \"Key: %d, Value: %s\" % (row.key, row.value))\n",
    "for record in stringsDS.collect():\n",
    "    print(record)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9987c966",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import Row\n",
    "\n",
    "Record = Row(\"key\", \"value\")\n",
    "recordsDF = spark.createDataFrame([Record(i, \"val_\" + str(i)) for i in range(1, 101)])\n",
    "recordsDF.createOrReplaceTempView(\"records\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "3981c7d8",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+------+---+------+\n",
      "|key| value|key| value|\n",
      "+---+------+---+------+\n",
      "|  2| val_2|  2| val_2|\n",
      "|  4| val_4|  4| val_4|\n",
      "|  5| val_5|  5| val_5|\n",
      "|  5| val_5|  5| val_5|\n",
      "|  5| val_5|  5| val_5|\n",
      "|  8| val_8|  8| val_8|\n",
      "|  9| val_9|  9| val_9|\n",
      "| 10|val_10| 10|val_10|\n",
      "| 11|val_11| 11|val_11|\n",
      "| 12|val_12| 12|val_12|\n",
      "| 12|val_12| 12|val_12|\n",
      "| 15|val_15| 15|val_15|\n",
      "| 15|val_15| 15|val_15|\n",
      "| 17|val_17| 17|val_17|\n",
      "| 18|val_18| 18|val_18|\n",
      "| 18|val_18| 18|val_18|\n",
      "| 19|val_19| 19|val_19|\n",
      "| 20|val_20| 20|val_20|\n",
      "| 24|val_24| 24|val_24|\n",
      "| 24|val_24| 24|val_24|\n",
      "+---+------+---+------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "spark.sql(\"SELECT * FROM records r JOIN src s ON r.key = s.key\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c35ff53b",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "373cbe9f",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
